## This is a script to create a SNP datasest given a reference sequence and reads

cd /path/to/working/directory

reference='/path/to/reference.fasta'
picard_path='/path/to/picard'
gatk_path='/path/to/gatk'
vcf2phylip_path='/path/to/vcf2phylip'
CORES= ##adjust this number to fit your machine configuration
occupancy= ##adjust this number to fit the number of taxa you have
module load ##module load all the necessary programs (java, anaconda3, parallel, bwa, samtools)
## adjust the ls commands below to suit your read file naming scheme
## Create index
echo 'creating index'
bwa index $reference
echo DONE
echo


## Get .sai files with bwa
echo 'getting sai'
mkdir -p aln-sai
ls *maerski*1.fq  | cut -d . -f 1 | parallel -j $CORES "bwa aln $reference {}.1.fq > {}.1.sai"
ls *maerski*2.fq  | cut -d . -f 1 | parallel -j $CORES "bwa aln $reference {}.2.fq > {}.2.sai"
ls *Crozet*1.fq   | cut -d . -f 1 | parallel -j $CORES "bwa aln $reference {}.1.fq > {}.1.sai"
ls *Crozet*2.fq   | cut -d . -f 1 | parallel -j $CORES "bwa aln $reference {}.2.fq > {}.2.sai"
ls *Pyrenees*1.fq | cut -d . -f 1 | parallel -j $CORES "bwa aln $reference {}.1.fq > {}.1.sai"
ls *Pyrenees*2.fq | cut -d . -f 1 | parallel -j $CORES "bwa aln $reference {}.2.fq > {}.2.sai"
echo DONE
echo

## Convert .sai to .bam
echo "converting sai to bam"
mkdir -p aln-bam
ls *maerski*1.fq  | cut -d . -f 1 | parallel -j $CORES "bwa sampe $reference {}.1.sai {}.2.sai {}.1.fq {}.2.fq | samtools view -bS - > {}.all.bam"
ls *Crozet*1.fq   | cut -d . -f 1 | parallel -j $CORES "bwa sampe $reference {}.1.sai {}.2.sai {}.1.fq {}.2.fq | samtools view -bS - > {}.all.bam"
ls *Pyrenees*1.fq | cut -d . -f 1 | parallel -j $CORES "bwa sampe $reference {}.1.sai {}.2.sai {}.1.fq {}.2.fq | samtools view -bS - > {}.all.bam"
mv *maerski*sai aln-sai/
mv *Crozet*sai aln-sai/
mv *Pyrenees*sai aln-sai/
echo DONE
echo

## Keep only mapped reads
echo "removing unmapped reads"
mkdir -p mapped
ls *maerski*all.bam  | cut -d . -f 1 | parallel -j $CORES "samtools view -b -F 4 {}.all.bam > {}-mapped-pe.bam"
ls *Crozet*all.bam   | cut -d . -f 1 | parallel -j $CORES "samtools view -b -F 4 {}.all.bam > {}-mapped-pe.bam"
ls *Pyrenees*all.bam | cut -d . -f 1 | parallel -j $CORES "samtools view -b -F 4 {}.all.bam > {}-mapped-pe.bam"
mv *maerski*all.bam aln-bam/
mv *Crozet*all.bam aln-bam/
mv *Pyrenees*all.bam aln-bam/
echo DONE
echo


## Add @RG information to alignments (also sorts data)
echo 'adding @RG info'
mkdir -p RG-added
for i in *-pe.bam;
do
	java -jar ${picard_path}/picard.jar AddOrReplaceReadGroups \
	I="$i" \
	SORT_ORDER=coordinate \
	RGPL=illumina \
	RGPU=D109LACXX \
	RGLB=Lib1 \
	RGID="${i%-pe.bam}" \
	RGSM="${i%-pe.bam}" \
	VALIDATION_STRINGENCY=LENIENT \
	O="${i%-pe.bam}-pe-RG.bam"
done
mv *pe.bam mapped/
echo DONE
echo


## Mark and remove PCR duplicates
echo 'deduping alignments'
mkdir -p deduped
for i *pe-RG.bam;
do
	java -jar ${picard_path}/picard.jar MarkDuplicates \
	INPUT="$i" \
	METRICS_FILE="${i%pe-RG.bam}pe-metricsfile" \
	MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=250 \
	ASSUME_SORTED=true \
	VALIDATION_STRINGENCY=LENIENT \
	REMOVE_DUPLICATES=True \
	OUTPUT="${i%pe-RG.bam}pe-DD.bam"
done
mv *RG.bam RG-added/
echo DONE
echo


## Merge .bam files
echo 'merging bam files'
ls *-DD.bam > names
taxon1=$(awk 'NR==1' names)
taxon2=$(awk 'NR==2' names)
taxon3=$(awk 'NR==3' names)
taxon4=$(awk 'NR==4' names)
taxon5=$(awk 'NR==5' names)
taxon6=$(awk 'NR==6' names)
taxon7=$(awk 'NR==7' names)
taxon8=$(awk 'NR==8' names)
taxon9=$(awk 'NR==9' names)
taxon10=$(awk 'NR==10' names)
taxon11=$(awk 'NR==11' names)
java -jar ${picard_path}/picard.jar MergeSamFiles \
	I=$taxon1 \
	I=$taxon2 \
	I=$taxon3 \
	I=$taxon4 \
	I=$taxon5 \
	I=$taxon6 \
	I=$taxon7 \
	I=$taxon8 \
	I=$taxon9 \
	I=$taxon10 \
	I=$taxon11 \
	ASSUME_SORTED=TRUE \
	MERGE_SEQUENCE_DICTIONARIES=TRUE \
	VALIDATION_STRINGENCY=LENIENT \
	O=merged.bam
mv *DD.bam deduped/
mv *metricsfile deduped/
rm names
echo DONE
echo


## Index merged .bam file
echo 'indexing bam file'
samtools index merged.bam
echo DONE
echo


## Index reference sequences using samtools
echo 'indexing reference'
samtools faidx $reference
echo DONE
echo


## Create a dictionary of reference sequences
echo 'creating dictionary'
base=$(echo $reference | cut -d . -f 1)
java -jar ${picard_path}/picard.jar CreateSequenceDictionary \
	R=$reference \
	O=$base.dict
echo DONE
echo


## Search for indels
echo 'searching for indels'
java -jar ${gatk_path}/GenomeAnalysisTK.jar \
	-T RealignerTargetCreator \
	-R $reference \
	-I merged.bam \
	--minReadsAtLocus 4 \
	-S LENIENT \
	-o merged.intervals
echo DONE
echo


## Realign around indels
echo 'realigning'
java -jar ${gatk_path}/GenomeAnalysisTK.jar \
	-T IndelRealigner \
	-R $reference \
	-I merged.bam \
	-targetIntervals merged.intervals \
	-LOD 5.0 \
	-S LENIENT \
	-o merged-realigned.bam
echo DONE
echo


## Call snps
echo 'calling SNPs'
# make sure to use GATK version before 3.7
java -jar ${gatk_path}/GenomeAnalysisTK.jar \
	-T UnifiedGenotyper \
	-R $reference \
	-I merged-realigned.bam \
	-gt_mode DISCOVERY \
	-stand_call_conf 30 \
	-stand_emit_conf 10 \
	-S LENIENT \
	-o rawSNPS-Q30.vcf
echo DONE
echo


## Call indels
echo 'calling indels'
java -jar ${gatk_path}/GenomeAnalysisTK.jar \
	-T UnifiedGenotyper \
	-R $reference \
	-I merged-realigned.bam \
	-gt_mode DISCOVERY \
	-glm INDEL \
	-stand_call_conf 30 \
	-stand_emit_conf 10 \
	-S LENIENT \
	-o inDels-Q30.vcf
echo DONE
echo
	
	
## Annotate snps
echo 'annotating SNPs'
java -jar ${gatk_path}/GenomeAnalysisTK.jar \
	-T VariantAnnotator \
	-R $reference \
	-I merged-realigned.bam \
	-G StandardAnnotation \
	-V:variant,VCF rawSNPS-Q30.vcf \
	-XA SnpEff \
	-S LENIENT \
	-o rawSNPS-Q30-annotated.vcf
echo DONE
echo


## Mask indels
echo 'masking indels'
java -jar ${gatk_path}/GenomeAnalysisTK.jar \
	-T VariantFiltration \
	-R $reference \
	-V rawSNPS-Q30-annotated.vcf \
	--mask inDels-Q30.vcf \
	--maskExtension 5 \
	--maskName InDel \
	--clusterWindowSize 10 \
	--filterExpression "MQ0 >= 4 && ((MQ0 / (1.0 * DP)) > 0.1)" \
	--filterName "BadValidation" \
	--filterExpression "QUAL < 30.0" \
	--filterName "LowQual" \
	--filterExpression "QD < 5.0" \
	--filterName "LowQualByDepth" \
	--filterExpression "FS > 60" \
	--filterName "FisherStrand" \
	-S LENIENT \
	-o Q30-SNPs.vcf
echo DONE
echo


## Output passing snps
echo 'outputting passing SNPs'
cat Q30-SNPs.vcf | grep 'PASS\|^#' > only-PASS-Q30-SNPs.vcf

## Analyze patterns of covariation in sequence dataset
echo 'analyzing covariation'
java -jar ${gatk_path}/GenomeAnalysisTK.jar \
	-T BaseRecalibrator \
	-I merged-realigned.bam \
	-R /$reference \
	-knownSites only-PASS-Q30-SNPs.vcf \
	-knownSites inDels-Q30.vcf \
	-S LENIENT \
	-o recal.table
echo DONE
echo


## Do a second pass to analyze covariation remaining after recalibration
echo 'recalibrating'
java -jar ${gatk_path}/GenomeAnalysisTK.jar \
	-T BaseRecalibrator \
	-R /$reference \
	-I merged-realigned.bam \
	-knownSites only-PASS-Q30-SNPs.vcf \
	-knownSites inDels-Q30.vcf \
	-BQSR recal.table \
	-S LENIENT \
	-o after-recal.table
echo DONE
echo


## Apply recalibration to sequence data
echo 'applying recalibration'
java -jar ${gatk_path}/GenomeAnalysisTK.jar \
	-T PrintReads \
	-R /$reference \
	-I merged-realigned.bam \
	-BQSR recal.table \
	-S LENIENT \
	-o recal.bam
echo DONE
echo

## Clean up first pass
echo 'cleaning up files'
mkdir -p SNPs_first_pass
mv *Q30* SNPs_first_pass
mv *merged* SNPs_first_pass
echo DONE
echo


## Call snps on recalibrated data but this time emit all sites
echo 'calling SNPs again'
java -jar ${gatk_path}/GenomeAnalysisTK.jar \
	-T UnifiedGenotyper \
	-R /$reference \
	-I recal.bam \
	-gt_mode DISCOVERY \
	-stand_call_conf 30 \
	-stand_emit_conf 10 \
	-out_mode EMIT_ALL_SITES \
	-S SILENT \
	-o all-sites-recal.vcf
echo DONE
echo


## Output snps only
echo 'outputting SNPs only'
java -jar ${gatk_path}/GenomeAnalysisTK.jar \
	-R /$reference \
	-T SelectVariants \
	--variant all-sites-recal.vcf \
	-env \
	-o rawSNPS-Q30-recal.vcf
echo DONE
echo


## Annotate snps
echo 'annotating SNPs again'
java -jar ${gatk_path}/GenomeAnalysisTK.jar \
	-T VariantAnnotator \
	-R /$reference \
	-I recal.bam \
	-G StandardAnnotation \
	-V:variant,VCF rawSNPS-Q30-recal.vcf \
	-XA SnpEff \
	-S LENIENT \
	-o rawSNPS-Q30-annotated-recal.vcf
echo DONE
echo


## Call indels
echo 'calling indels again'
java -jar ${gatk_path}/GenomeAnalysisTK.jar \
	-T UnifiedGenotyper \
	-R /$reference \
	-I recal.bam \
	-gt_mode DISCOVERY \
	-glm INDEL \
	-stand_call_conf 30 \
	-stand_emit_conf 10 \
	-S LENIENT \
	-o inDels-Q30-recal.vcf
echo DONE
echo


## Mask indels
echo 'masking indels again'
java -jar ${gatk_path}/GenomeAnalysisTK.jar \
	-T VariantFiltration \
	-R /$reference \
	-V rawSNPS-Q30-annotated-recal.vcf \
	--mask inDels-Q30-recal.vcf \
	--maskExtension 5 \
	--maskName InDel \
	--clusterWindowSize 10 \
	--filterExpression "MQ0 >= 4 && ((MQ0 / (1.0 * DP)) > 0.1)" \
	--filterName "BadValidation" \
	--filterExpression "QUAL < 30.0" \
	--filterName "LowQual" \
	--filterExpression "QD < 5.0" \
	--filterName "LowQualByDepth" \
	--filterExpression "FS > 60" \
	--filterName "FisherStrand" \
	-S LENIENT \
	-o Q30-SNPs-recal.vcf
echo DONE
echo


## Output only passing snps
echo 'outputting passing SNPs again'
cat Q30-SNPs-recal.vcf | grep 'PASS\|^#' > PASS-Q30-SNPs-recal.vcf
echo DONE
echo


## Convert to phylip
echo 'converting to phylip'
python ${vcf2phylip_path}/vcf2phylip.py -i PASS-Q30-SNPs-recal.vcf -m $occupancy  --output-prefix final_SNPs
echo DONE
echo

## Cleaning up directory
echo 'cleaning up files'
mkdir -p SNPs_second_pass
mv *recal* SNPs_second_pass
echo DONE
echo

echo 'Pipeline Completed!'
